# Preprocessing configuration: keep the raw line text untouched (no punctuation
# removal, no lowercasing, no stopword removal, no contraction expansion) and
# only pull the stage directions out into their own column.
param_dict = dict(
    concat_scenes=False,
    extract_direc=True,
    remove_punct=False,
    rmv_stopwords=False,
    lwr=False,
    exp_contractions=False,
    conversion=None,
)
df = preprocess(df_raw, **param_dict)
df.head()
| season | episode | scene | line_text | speaker | season_episode | directorials | |
|---|---|---|---|---|---|---|---|
| id | |||||||
| 1 | 1 | 1 | 1 | All right Jim. Your quarterlies look very good... | Michael | 101 | NaN |
| 2 | 1 | 1 | 1 | Oh, I told you. I couldn't close it. So... | Jim | 101 | NaN |
| 3 | 1 | 1 | 1 | So you've come to the master for guidance? Is ... | Michael | 101 | NaN |
| 4 | 1 | 1 | 1 | Actually, you called me in here, but yeah. | Jim | 101 | NaN |
| 5 | 1 | 1 | 1 | All right. Well, let me show you how it's done. | Michael | 101 | NaN |
# Scene appearances per character, stacked by season.
fig = px.bar(char_app_count_sorted, x='speaker', y='counts', color='season', color_discrete_sequence=px.colors.qualitative.Prism, title='Scene appearances per character')
fig.update_xaxes(categoryorder='array', categoryarray=top_char)
# FIX: `fig` was rebound by the next px.bar call before ever being displayed,
# so the first two charts were silently lost — show each figure before reuse.
# NOTE(review): this chart orders the x-axis by `top_char` while the two below
# use `top20_characters` — confirm the difference is intentional.
fig.show()

# Lines spoken per character, stacked by season.
fig = px.bar(lines_per_character.sort_values("season"), x="speaker", y="line_text", color='season', color_discrete_sequence=px.colors.qualitative.Prism, title='Lines per character')
fig.update_xaxes(categoryorder='array', categoryarray=top20_characters)
fig.update_yaxes(title='number of lines')
# FIX: likewise, display before `fig` is rebound below.
fig.show()

# Word count per character, stacked by season.
fig = px.bar(words_per_character.sort_values("season"), x="speaker", y="word_count", color='season', color_discrete_sequence=px.colors.qualitative.Prism, title='Words per character')
fig.update_xaxes(categoryorder='array', categoryarray=top20_characters)
fig.update_yaxes(title='number of words')
fig.show()
# Preprocessing configuration for the tokenized corpus: strip punctuation,
# lowercase, expand contractions, and convert each line into a token list.
# The tokenizer tuple configures a TreeBankWord tokenizer plus auxiliary files
# (character names and a compound-word list) used during tokenization.
param_dict_tokens = dict(
    concat_scenes=False,
    extract_direc=True,
    remove_punct=True,
    rmv_stopwords=False,
    lwr=True,
    exp_contractions=True,
    conversion="tokenize",
    tokenizer=(
        "TreeBankWord",
        True,
        PATH + "character_names.csv",
        PATH + "compound_words_the-office_by_chatgpt.txt",
    ),
)
df_tokens = preprocess(df_raw, **param_dict_tokens)
df_tokens.head()
| season | episode | scene | line_text | speaker | season_episode | directorials | |
|---|---|---|---|---|---|---|---|
| id | |||||||
| 1 | 1 | 1 | 1 | [all, right, jim, your, quarterlies, look, ver... | Michael | 101 | NaN |
| 2 | 1 | 1 | 1 | [oh, i, told, you, i, could, not, close, it, so] | Jim | 101 | NaN |
| 3 | 1 | 1 | 1 | [so, you, have, come, to, the, master, for, gu... | Michael | 101 | NaN |
| 4 | 1 | 1 | 1 | [actually, you, called, me, in, here, but, yeah] | Jim | 101 | NaN |
| 5 | 1 | 1 | 1 | [all, right, well, let, me, show, you, how, it... | Michael | 101 | NaN |
# Flatten every line's token list into one corpus-wide list of words.
all_words = [word for line in df_tokens["line_text"].tolist() for word in line]

# Corpus-wide word frequencies, sorted from most to least common.
all_words_freq = nltk.FreqDist(all_words)
df_all_words_freq = pd.Series(dict(all_words_freq)).sort_values(ascending=False)

# Word cloud of the 100 most frequent words.
wordcloud = WordCloud(
    width=800,
    height=300,
    background_color="white",
    max_words=100,
    contour_width=3,
    contour_color='steelblue',
).generate(" ".join(all_words))
wordcloud.to_image()

# Horizontal bar chart of the 16 most common words.
top16 = df_all_words_freq[:16]
fig = px.bar(y=top16.index, x=top16.values, orientation='h',
             title='Most common words', height=450)
fig.update_layout(yaxis_title='Word', xaxis_title='Frequency')
fig.show()
New preprocessing run: in addition to the steps above, stopwords are removed before recomputing the word frequencies.
# NOTE(review): this chart is titled "after stopword removal" but still plots
# df_all_words_freq, which was computed from the tokenization WITHOUT stopword
# removal — the frequency series presumably needs to be recomputed from a
# stopword-filtered token list first. TODO confirm against the notebook's
# original cell order.
fig2 = px.bar(y=df_all_words_freq[:16].index, x=df_all_words_freq[:16].values, orientation='h', title='Most common words (after stopword removal)', height=450)
fig2.show()
# Preprocessing configuration for POS tagging: lowercase only, keep punctuation
# and contractions, and convert each line into (token, POS-tag) pairs.
# NOTE(review): the name says "nostopwords", yet rmv_stopwords is False here,
# meaning stopwords are KEPT — the name is misleading; consider renaming to
# something like param_dict_pos_tag (not done here since later cells may
# reference this name).
param_dict_tokens_nostopwords = {
"concat_scenes": False,
"extract_direc": True,
"remove_punct": False,
"rmv_stopwords": False,
"lwr": True,
"exp_contractions": False,
"conversion": "pos_tag"
}
df_tokens_tagged = preprocess(df_raw, **param_dict_tokens_nostopwords)
df_tokens_tagged.head()
| season | episode | scene | line_text | speaker | season_episode | directorials | |
|---|---|---|---|---|---|---|---|
| id | |||||||
| 1 | 1 | 1 | 1 | [(all, DT), (right, JJ), (jim., NN), (your, PR... | Michael | 101 | NaN |
| 2 | 1 | 1 | 1 | [(oh, UH), (,, ,), (i, JJ), (told, VBD), (you.... | Jim | 101 | NaN |
| 3 | 1 | 1 | 1 | [(so, RB), (you, PRP), ('ve, VBP), (come, VBN)... | Michael | 101 | NaN |
| 4 | 1 | 1 | 1 | [(actually, RB), (,, ,), (you, PRP), (called, ... | Jim | 101 | NaN |
| 5 | 1 | 1 | 1 | [(all, DT), (right., NN), (well, RB), (,, ,), ... | Michael | 101 | NaN |
# Most common adjectives. The mis-tagged token 'i' is dropped from the series.
all_tagged_freq = nltk.FreqDist(all_words_tagged_filtered_jj)
df_all_tagged_freq = pd.Series(dict(all_tagged_freq)).sort_values(ascending=False).drop('i')
top_adjectives = df_all_tagged_freq[:10]
fig2 = px.bar(y=top_adjectives.index, x=top_adjectives.values, orientation='h',
              title='Most common Adjectives', height=450)
fig2.update_traces(width=0.5)
fig2.show()

# Most common nouns.
all_tagged_freq = nltk.FreqDist(all_words_tagged_filtered_nn)
df_all_tagged_freq = pd.Series(dict(all_tagged_freq)).sort_values(ascending=False)
top_nouns = df_all_tagged_freq[:10]
fig3 = px.bar(y=top_nouns.index, x=top_nouns.values, orientation='h',
              title='Most common Nouns', height=450)
fig3.update_traces(width=0.5)
fig3.show()
TF-IDF is used to determine important words in the dataset.
features_tfidf_agg[0:10]
you 0.072911 michael 0.059165 is 0.056919 to 0.054235 the 0.052029 it 0.047500 dwight 0.045659 jim 0.045000 that 0.040082 pam 0.039464 dtype: float64
# Lexical dispersion: where selected topic words occur across the whole series.
target_words = [
    'birthday', 'wedding', 'scranton', 'stamford', 'philly',
    'dundie', 'farm', 'boat', 'manager',
]
fig = plt.figure(figsize=(10, 4))
ax = fig.add_subplot(111)
visualizer = DispersionPlot(target_words, ax=ax)
visualizer.fit([all_words])
visualizer.show();
# Bigram and trigram collocation finders over the full word list.
finder_2 = nltk.collocations.BigramCollocationFinder.from_words(all_words)
finder_3 = nltk.collocations.TrigramCollocationFinder.from_words(all_words)
# The 18 most frequent trigrams by raw count (ngram_fd is a frequency
# distribution, not an association-scored collocation ranking).
finder_3.ngram_fd.most_common(18)
[(('let', 'us', 'go'), 203),
(('let', 'us', 'get'), 107),
(('hey', 'hey', 'hey'), 78),
(('whoa', 'whoa', 'whoa'), 63),
(('oh', 'god', 'oh'), 59),
(('na', 'na', 'na'), 55),
(('let', 'us', 'see'), 52),
(('go', 'go', 'go'), 49),
(('one', 'two', 'three'), 45),
(('stop', 'stop', 'stop'), 44),
(('god', 'oh', 'god'), 41),
(('come', 'let', 'us'), 39),
(('blah', 'blah', 'blah'), 37),
(('ha', 'ha', 'ha'), 35),
(('okay', 'let', 'us'), 34),
(('yes', 'yes', 'yes'), 33),
(('wait', 'wait', 'wait'), 32),
(('get', 'back', 'work'), 31)]
# NOTE(review): at this point `fig` is the matplotlib figure created for the
# dispersion plot above — this show() call looks like a leftover; verify it is
# intentional.
fig.show()
# Side-by-side table of the most frequent trigrams for Michael vs. Dwight.
df_ngrams_michael_dwight
| ngram Michael | ngram Dwight | |
|---|---|---|
| 0 | ((let, us, go), 75) | ((let, us, go), 38) |
| 1 | ((let, us, get), 35) | ((hey, hey, hey), 21) |
| 2 | ((hey, hey, hey), 26) | ((yes, yes, yes), 17) |
| 3 | ((come, let, us), 23) | ((let, us, get), 15) |
| 4 | ((oh, god, oh), 21) | ((ha, ha, ha), 13) |
| 5 | ((beep, beep, beep), 21) | ((go, go, go), 13) |
| 6 | ((let, us, see), 18) | ((jim, jim, jim), 12) |
| 7 | ((god, oh, god), 17) | ((wait, wait, wait), 10) |
| 8 | ((stop, stop, stop), 16) | ((whoa, whoa, whoa), 9) |
| 9 | ((na, na, na), 16) | ((la, la, la), 9) |
| 10 | ((go, let, us), 13) | ((let, us, see), 7) |
| 11 | ((yeah, yeah, yeah), 13) | ((michael, michael, michael), 7) |
| 12 | ((right, right, right), 13) | ((zero, zero, zero), 7) |
| 13 | ((blah, blah, blah), 13) | ((volunteer, sheriffs, deputy), 6) |
| 14 | ((go, go, go), 13) | ((one, two, three), 6) |
# Score a sample line with the sentiment analyzer: the result holds neg/neu/pos
# proportions plus a normalized 'compound' score.
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("All right Jim. Your quarterlies look very good. How are things at the library?")
{'neg': 0.0, 'neu': 0.803, 'pos': 0.197, 'compound': 0.4927}
# Display rolling sentiment (neg/neu/pos) over time for season 7, episode 21.
sentiment_colors = ['rgb(213,94,0)', 'rgb(240,228,66)', 'rgb(0,158,115)']
episode_mask = (
    (df_rolling["season"] == 7)
    & (df_rolling['episode'] >= 21)
    & (df_rolling['episode'] < 22)
)
fig = px.line(
    df_rolling[episode_mask],
    x="id",
    y=["neg", "neu", "pos"],
    title="Sentiment over time",
    color_discrete_sequence=sentiment_colors,
    height=300,
)
fig.show()
Additionally, SpacyTextBlob and a transformer model for sentiment analysis (cardiffnlp/twitter-roberta-base-sentiment) were evaluated.
# Rolling mean (window of 20 lines) of the seven emotion scores for the
# Jim & Dwight scenes, plotted across the series.
emotion_cols = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
pd.DataFrame(emotion_analysis)[emotion_cols].rolling(20).mean().plot(cmap="Dark2", figsize=(12, 4))
plt.xlabel("lines")
plt.ylabel("degree")
plt.title("Emotions throughout the series: Jim & Dwight", fontsize=18)
plt.show()
# Rolling mean (window of 20 lines) of the seven emotion scores for the
# Jim & Pam scenes, plotted across the series.
emotion_columns = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
rolling_emotions = pd.DataFrame(emotion_analysis)[emotion_columns].rolling(20).mean()
rolling_emotions.plot(cmap="Dark2", figsize=(12, 4))
plt.xlabel("lines")
plt.ylabel("degree")
plt.title("Emotions throughout the series: Jim & Pam", fontsize=18)
plt.legend(frameon=True, framealpha=.8)
plt.show()
Approaches:
New approach: BERTopic
# Fit the topic model on the unprocessed lines; returns one topic id per
# document and per-document topic probabilities.
topics, probs = topic_model.fit_transform(data_unp)
# intertopic distance map of the full model (more than 900 topics)
topic_model.visualize_topics()
# bar charts for 8 hand-picked topics of interest
topic_model.visualize_barchart([1,2,3,4,32,50,105,106])
# merge the topics down to 30 and re-inspect the map
topic_model.reduce_topics(data_unp, nr_topics=30)
topic_model.visualize_topics()
# bar charts for 4 topics of the reduced model
topic_model.visualize_barchart([1,5,16,21], width=225)
# how selected topics wax and wane over the course of the show
topic_model.visualize_topics_over_time(topics_over_time, topics=[5,7,14], width=960, height=400)